In [1]:
# Analysis can be run locally and then migrate to crunchy
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
# from sklearn.metrics import root_mean_squared_error
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from xgboost import plot_importance
In [57]:
#!pip list --format=freeze > requirements.txt
Load Training Data¶
In [2]:
directory = r'C:\Users\yingl\OneDrive\Desktop\MultiorePerformancePrediction\MultiorePerformancePrediction\data\training_data'
def collect_training_data(directory = ''):
lst = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith(".csv"):
lst.append(pd.read_csv(os.path.join(root, file)))
df_raw = pd.concat(lst)
print(f'length starts: {len(df_raw)}')
for col in ['branch-instructions', 'branch-misses', 'cache-misses', 'cache-references', 'cpu-cycles', 'instructions',
'stalled-cycles-frontend', 'L1-icache-load-misses', 'LLC-load-misses', 'LLC-loads', 'LLC-stores',
'L1-dcache-prefetch-misses', 'L1-dcache-prefetches', 'L1-icache-loads',
'branch-load-misses', 'branch-loads', 'dTLB-load-misses', 'dTLB-loads', 'iTLB-load-misses', 'iTLB-loads']:
df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
#df_raw.to_csv('training_data_all.csv', index=False)
#df_raw[numeric_feature_lst].corr().to_csv('corr.csv')
return df_raw
df_raw = collect_training_data(directory)
# def f(row):
# if any([row['run_id'].startswith(i) for i in ['bfs', 'lavaMD', 'kmeans', 'myocyte']]):
# return 'rodinia'
# else:
# return 'parsec'
# # df_raw['benchmark'] = df_raw.apply(f, axis=1)
df_raw['runtime_serial'] = df_raw['speed_up'] * df_raw['compute_time']
df_raw['IPC'] = df_raw['instructions'] / df_raw['cpu-cycles']
df_raw['IPS'] = df_raw['instructions'] / df_raw['compute_time']
length starts: 2623
In [3]:
print(set(df_raw['hostname']))
{'cuda1', 'crunchy5', 'crackle3', 'crackle5', 'snappy1', 'crunchy1', 'crackle1', 'snappy4', 'crunchy6'}
Anomaly Detection¶
In [4]:
sns.displot(df_raw[['speed_up', 'benchmark']], x="speed_up", hue="benchmark")
plt.show()
C:\Users\yingl\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [5]:
df_raw = df_raw.loc[df_raw['speed_up'] <= 100]
sns.displot(df_raw[['speed_up', 'benchmark']], x="speed_up", hue="benchmark")
plt.show()
C:\Users\yingl\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [6]:
plt.figure(figsize=(10, 6))
sns.set(style='whitegrid')
sns.boxplot(x="threads",
y="speed_up",
data=df_raw[['speed_up', 'threads', 'benchmark']])
Out[6]:
<Axes: xlabel='threads', ylabel='speed_up'>
In [53]:
fig, axes = plt.subplots(3, 4, figsize=(10, 8))
p_list = sorted(list(set(df_raw['program'])))
for i, ax in enumerate(axes.flatten()):
p = p_list[i]
df_slice = df_raw[['threads', 'speed_up', 'program']].loc[df_raw['program'] == p].groupby(['program', 'threads']).mean().reset_index()
ax.plot(df_slice['threads'], df_slice['speed_up'], linewidth=1.0)
ax.set_title(p, size=10)
fig.supxlabel('threads')
fig.supylabel('speed up')
plt.show()
In [8]:
fig, axes = plt.subplots(3, 3, figsize=(10, 8))
h_list = sorted(list(set(df_raw['hostname'])))
for i, ax in enumerate(axes.flatten()):
h = h_list[i]
df_slice = df_raw[['threads', 'speed_up', 'hostname']].loc[df_raw['hostname'] == h].groupby(['hostname', 'threads']).mean().reset_index()
ax.plot(df_slice['threads'], df_slice['speed_up'], linewidth=1.0)
ax.set_title(h, size=10)
fig.supxlabel('threads')
fig.supylabel('speed up')
plt.show()
Missing Data¶
In [9]:
class_feature_lst = []
numeric_feature_lst = []
for col in list(df_raw):
null_pct = df_raw[col].isnull().sum() / len(df_raw)
if null_pct <= 0.2:
#print(col, null_pct, df_raw.dtypes[col], df_raw.iloc[0])
if df_raw.dtypes[col] == 'object':
class_feature_lst.append(col)
else:
numeric_feature_lst.append(col)
numeric_feature_lst.remove('speed_up')
numeric_feature_lst.remove('compute_time')
print(numeric_feature_lst)
print(class_feature_lst)
['branch-instructions', 'branch-misses', 'cache-misses', 'cache-references', 'cpu-cycles', 'instructions', 'stalled-cycles-frontend', 'alignment-faults', 'bpf-output', 'context-switches', 'cpu-clock', 'cpu-migrations', 'dummy', 'emulation-faults', 'major-faults', 'minor-faults', 'page-faults', 'task-clock', 'L1-dcache-load-misses', 'L1-dcache-loads', 'L1-dcache-prefetch-misses', 'L1-icache-load-misses', 'LLC-load-misses', 'LLC-loads', 'LLC-stores', 'branch-load-misses', 'branch-loads', 'dTLB-load-misses', 'dTLB-loads', 'iTLB-load-misses', 'iTLB-loads', 'msr/aperf/', 'msr/mperf/', 'msr/tsc/', 'threads', 'host_cpu_user', 'host_cpu_system', 'host_cpu_idle', 'host_memused', 'CPU(s)', 'Thread(s) per core', 'Core(s) per socket', 'Socket(s)', 'NUMA node(s)', 'CPU family', 'Model', 'Stepping', 'CPU MHz', 'BogoMIPS', 'runtime_serial', 'IPC', 'IPS'] ['size', 'run_time', 'benchmark', 'run_id', 'program', 'hostname', 'Architecture', 'CPU op-mode(s)', 'Byte Order', 'On-line CPU(s) list', 'Vendor ID', 'Model name', 'Virtualization', 'L1d cache', 'L1i cache', 'L2 cache', 'L3 cache', 'NUMA node0 CPU(s)', 'NUMA node1 CPU(s)', 'total_memory']
Feature Examine¶
In [10]:
# take out null
df = df_raw[numeric_feature_lst + ['speed_up']]
sns.heatmap(df.corr())
# for col in list(df):
# df[col] = pd.to_numeric(df[col], errors='coerce')
df.dropna(inplace=True)
print(f'length after dropping: {len(df)}')
# return df
# df = preprocess_training(df_raw)
length after dropping: 2234
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\2929058954.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.dropna(inplace=True)
Numerical Feature (p value)¶
In [11]:
X = df[numeric_feature_lst]
y = df['speed_up']
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: speed_up R-squared: 0.430
Model: OLS Adj. R-squared: 0.419
Method: Least Squares F-statistic: 40.35
Date: Tue, 16 Apr 2024 Prob (F-statistic): 6.70e-234
Time: 22:57:25 Log-Likelihood: -6778.2
No. Observations: 2234 AIC: 1.364e+04
Df Residuals: 2192 BIC: 1.388e+04
Df Model: 41
Covariance Type: nonrobust
=============================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------
const -20.7715 12.169 -1.707 0.088 -44.636 3.093
branch-instructions 3.453e-09 1.45e-09 2.379 0.017 6.07e-10 6.3e-09
branch-misses 9.903e-09 3.24e-09 3.058 0.002 3.55e-09 1.63e-08
cache-misses -1.27e-08 3.9e-08 -0.325 0.745 -8.92e-08 6.38e-08
cache-references -2.552e-08 1.52e-08 -1.680 0.093 -5.53e-08 4.27e-09
cpu-cycles -1.057e-09 4.79e-10 -2.207 0.027 -2e-09 -1.18e-10
instructions 3.841e-10 6.02e-11 6.383 0.000 2.66e-10 5.02e-10
stalled-cycles-frontend -6.471e-12 6.29e-11 -0.103 0.918 -1.3e-10 1.17e-10
alignment-faults 0.0251 0.015 1.702 0.089 -0.004 0.054
bpf-output -0.0979 0.057 -1.706 0.088 -0.211 0.015
context-switches 2.298e-05 1.13e-05 2.035 0.042 8.35e-07 4.51e-05
cpu-clock 0.0148 0.012 1.285 0.199 -0.008 0.037
cpu-migrations -3.168e-05 1.96e-05 -1.616 0.106 -7.01e-05 6.76e-06
dummy 0.3488 0.204 1.706 0.088 -0.052 0.750
emulation-faults -0.0120 0.007 -1.702 0.089 -0.026 0.002
major-faults -1.1383 5.088 -0.224 0.823 -11.116 8.839
minor-faults -0.3345 0.231 -1.446 0.148 -0.788 0.119
page-faults 0.3345 0.231 1.446 0.148 -0.119 0.788
task-clock -0.0140 0.011 -1.232 0.218 -0.036 0.008
L1-dcache-load-misses -1.037e-09 3.53e-09 -0.294 0.769 -7.95e-09 5.88e-09
L1-dcache-loads -5.842e-10 8.84e-10 -0.661 0.509 -2.32e-09 1.15e-09
L1-dcache-prefetch-misses 2.327e-09 1.61e-09 1.446 0.148 -8.28e-10 5.48e-09
L1-icache-load-misses -4.669e-09 6.48e-09 -0.720 0.472 -1.74e-08 8.04e-09
LLC-load-misses -1.466e-08 4.17e-08 -0.351 0.725 -9.65e-08 6.72e-08
LLC-loads 2.947e-08 1.49e-08 1.982 0.048 3.19e-10 5.86e-08
LLC-stores -9.504e-09 5.26e-09 -1.807 0.071 -1.98e-08 8.08e-10
branch-load-misses -1.22e-09 1.83e-10 -6.663 0.000 -1.58e-09 -8.61e-10
branch-loads -3.228e-09 1.44e-09 -2.239 0.025 -6.06e-09 -4.01e-10
dTLB-load-misses -1.309e-07 2.86e-08 -4.574 0.000 -1.87e-07 -7.48e-08
dTLB-loads -5.833e-10 8.87e-10 -0.657 0.511 -2.32e-09 1.16e-09
iTLB-load-misses 6.009e-07 2.34e-07 2.571 0.010 1.43e-07 1.06e-06
iTLB-loads 3.77e-13 1.83e-11 0.021 0.984 -3.55e-11 3.63e-11
msr/aperf/ 9.895e-10 4.76e-10 2.077 0.038 5.51e-11 1.92e-09
msr/mperf/ -2.259e-09 3.08e-09 -0.733 0.464 -8.3e-09 3.79e-09
msr/tsc/ 1.947e-09 3.13e-09 0.621 0.534 -4.2e-09 8.09e-09
threads 0.0102 0.003 3.421 0.001 0.004 0.016
host_cpu_user 0.0119 0.692 0.017 0.986 -1.345 1.369
host_cpu_system -0.8805 1.644 -0.536 0.592 -4.104 2.343
host_cpu_idle 0.0311 0.694 0.045 0.964 -1.330 1.392
host_memused 0.0431 0.102 0.422 0.673 -0.157 0.243
CPU(s) -58.4822 34.204 -1.710 0.087 -125.559 8.594
Thread(s) per core -94.9821 55.651 -1.707 0.088 -204.116 14.152
Core(s) per socket 228.3077 133.787 1.707 0.088 -34.054 490.670
Socket(s) -40.0564 23.466 -1.707 0.088 -86.074 5.961
NUMA node(s) -33.0120 19.335 -1.707 0.088 -70.930 4.906
CPU family -104.3193 61.104 -1.707 0.088 -224.147 15.509
Model -109.1971 63.935 -1.708 0.088 -234.577 16.183
Stepping 59.2282 34.708 1.706 0.088 -8.836 127.293
CPU MHz 7.166e-05 0.000 0.273 0.785 -0.000 0.001
BogoMIPS 1.1269 0.660 1.709 0.088 -0.167 2.420
runtime_serial 0.6122 0.101 6.067 0.000 0.414 0.810
IPC -8.1354 0.535 -15.211 0.000 -9.184 -7.087
IPS 5.732e-11 1.83e-12 31.363 0.000 5.37e-11 6.09e-11
==============================================================================
Omnibus: 1798.204 Durbin-Watson: 0.708
Prob(Omnibus): 0.000 Jarque-Bera (JB): 132581.852
Skew: 3.253 Prob(JB): 0.00
Kurtosis: 40.175 Cond. No. 3.23e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.23e+16. This might indicate that there are
strong multicollinearity or other numerical problems.
Categorical Feature (chi test)¶
In [12]:
for to_test in class_feature_lst:
# Create a DataFrame
#to_test = 'Architecture'
df_cat_test = df_raw[['speed_up', to_test]]
# Create a contingency table
table = pd.crosstab(df_cat_test[to_test], df_cat_test['speed_up'])
# Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(table)
# Interpret the results
if p < 0.1:
print('Y --', to_test, p)
else:
print('N --', to_test, p)
N -- size 0.3388603240072237 N -- run_time 0.25517469787142555 N -- benchmark 0.3760295656969437 Y -- run_id 0.05050658245661779 N -- program 0.2504308114994181 N -- hostname 0.2894810901816114 N -- Architecture 1.0 N -- CPU op-mode(s) 1.0 N -- Byte Order 1.0 N -- On-line CPU(s) list 0.3567573935697952 N -- Vendor ID 0.42482723426855573 N -- Model name 0.35675739356979225 N -- Virtualization 0.42482723426855573 N -- L1d cache 0.42482723426855573 N -- L1i cache 0.42482723426855573 N -- L2 cache 0.42482723426855573 N -- L3 cache 0.3567573935697952 N -- NUMA node0 CPU(s) 0.3567573935697952 N -- NUMA node1 CPU(s) 0.3567573935697952 N -- total_memory 0.38568132475947114
PCA for Virtulization¶
In [13]:
df_pca = df_raw[numeric_feature_lst + ['benchmark']]
df_pca.dropna(inplace=True)
d = {}
d['data'] = df_pca[numeric_feature_lst]
d['benchmark'] = df_pca['benchmark']
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\1446170956.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pca.dropna(inplace=True)
In [14]:
pca = PCA(n_components=2)
components = pca.fit_transform(d['data'])
fig = px.scatter(components, x=0, y=1, color=d['benchmark'])
fig.show()
Preprocess data¶
Train Test Split¶
In [15]:
df_tmp = df_raw[numeric_feature_lst + ['speed_up', 'hostname', 'program']]
df_tmp.dropna(inplace=True)
df_train = df_tmp.loc[df_tmp['hostname'] != 'crunchy6']
df_test = df_tmp.loc[df_tmp['hostname'] == 'crunchy6']
print(f'df_train length {len(df_train)}, df_test length {len(df_test)}')
X_train = df_train[numeric_feature_lst]
y_train = df_train['speed_up']
X_test = df_test[numeric_feature_lst]
y_test = df_test['speed_up']
df_result = df_test[['speed_up', 'threads', 'program']]
df_train length 2011, df_test length 223
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\3988477311.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [16]:
X_train_random = X_train.copy()
X_test_random = X_test.copy()
X_train_random["RANDOM"] = np.random.RandomState(42).randn(X_train.shape[0])
X_test_random["RANDOM"] = np.random.RandomState(42).randn(X_test.shape[0])
In [17]:
numeric_selected_feature_lst = ['CPU(s)','cpu-cycles','host_memused','dTLB-loads','L1-dcache-loads','instructions','branch-loads',
'Stepping','CPU MHz','LLC-stores','LLC-load-misses','CPU family','BogoMIPS','branch-load-misses','context-switches','cache-misses',
'cpu-migrations','iTLB-loads','msr/mperf/','branch-instructions','task-clock','L1-dcache-prefetch-misses',
#'compute_time',
'msr/tsc/','threads','cpu-clock','branch-misses','stalled-cycles-frontend','minor-faults','L1-icache-load-misses','dTLB-load-misses',
'iTLB-load-misses','page-faults','L1-dcache-load-misses','LLC-loads','IPC','cache-references','runtime_serial','IPS',]
df_tmp_2 = df_raw[numeric_selected_feature_lst + ['speed_up', 'hostname']]
df_tmp_2.dropna(inplace=True)
df_train_limited = df_tmp_2.loc[df_tmp_2['hostname'] != 'crunchy6']
df_test_limited = df_tmp_2.loc[df_tmp_2['hostname'] == 'crunchy6']
print(f'df_train length {len(df_train)}, df_test length {len(df_test)}')
X_train_limited = df_train_limited[numeric_selected_feature_lst]
y_train_limited = df_train_limited['speed_up']
X_test_limited = df_test_limited[numeric_selected_feature_lst]
y_test_limited = df_test_limited['speed_up']
df_train length 2011, df_test length 223
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\3932590972.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Normalization¶
In [18]:
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
Prediction¶
In [19]:
df_result
Out[19]:
| speed_up | threads | program | |
|---|---|---|---|
| 0 | 1.000000 | 1 | bfs |
| 1 | 1.445450 | 2 | bfs |
| 2 | 2.453439 | 4 | bfs |
| 3 | 3.814242 | 8 | bfs |
| 4 | 3.790494 | 16 | bfs |
| ... | ... | ... | ... |
| 0 | 1.000000 | 1 | swaptions |
| 1 | 1.944333 | 2 | swaptions |
| 2 | 3.667928 | 4 | swaptions |
| 3 | 6.387150 | 8 | swaptions |
| 4 | 12.837748 | 16 | swaptions |
223 rows × 3 columns
1) Linear Regression¶
1.1) Linear Regression with selected features¶
In [20]:
# def predict_regression(df):
# X = df[numeric_feature_lst]
# y = df['speed_up']
# # Split the DataFrame into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# # Create a scikit-learn model
# model = LinearRegression()
# # Fit the model to the training data
# model.fit(X_train, y_train)
# # Make predictions on the testing data
# y_pred = model.predict(X_test) # need to cap it >= 0
# # Evaluate the model's performance
# print('model.score', model.score(X_test, y_test))
# # print('RMSE', root_mean_squared_error(y_test, y_pred))
# print('MSE', mean_squared_error(y_test, y_pred))
# print('MAE', mean_absolute_error(y_test, y_pred))
# predict_regression(df)
In [22]:
def perform_linear_and_ridge_regression(X_train, X_test, y_train, y_test):
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
lin_reg_parameters = { 'fit_intercept': [True, False] }
lin_reg = GridSearchCV(LinearRegression(), lin_reg_parameters, cv=5)
lin_reg.fit(X=X_train, y=y_train)
y_pred = lin_reg.predict(X_test)
# print('model.score', model.score(X_test, y_test))
# print('RMSE', root_mean_squared_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
# X_std = transformer.transform(X)
# X_std = pd.DataFrame(X_std, columns=X.columns)
perform_linear_and_ridge_regression(X_train, X_test, y_train, y_test)
perform_linear_and_ridge_regression(X_train_std, X_test_std, y_train, y_test)
# perform_linear_and_ridge_regression(X=X, Y=y)
MSE 18.263505764081053 MAE 2.5669553821475053 MSE 18.263509075173026 MAE 2.5669550187903982
1.2) Linear regression w/ L1 regularization¶
In [23]:
def perform_lr_l1(X_train, X_test, y_train, y_test):
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
perform_lr_l1(X_train, X_test, y_train, y_test)
perform_lr_l1(X_train_std, X_test_std, y_train, y_test)
MSE 19.586783168604978 MAE 2.5937254893209896 MSE 22.025947893117134 MAE 2.7450394212663967
C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.827e+04, tolerance: 9.111e+00
In [24]:
# reg = LassoCV(cv=5, random_state=0).fit(X_train_std, y_train)
# y_pred = reg.predict(X_test_std)
# print('MSRE', mean_squared_error(y_test, y_pred))
# print('MAE', mean_absolute_error(y_test, y_pred))
1.3)* Linear regression w/ L2 regularization¶
In [25]:
def perform_lr_l2(X_train, X_test, y_train, y_test):
ridgecv = linear_model.RidgeCV(alphas=[0.001, 0.01, 0.1, 0.5, 1, 10], cv=5)
ridgecv.fit(X_train, y_train) # not converging is not std
y_pred = ridgecv.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
df_result['lr_l2'] = y_pred
perform_lr_l2(X_train, X_test, y_train, y_test)
# perform_lr_l2(X_train_std, X_test_std, y_train, y_test)
C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=3.74671e-30): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=3.69755e-30): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=3.68756e-30): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=3.71363e-30): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=5.70789e-29): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.14919e-29): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.09523e-29): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.08419e-29): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.11305e-29): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=5.70969e-28): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.15226e-28): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.09831e-28): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.08724e-28): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.11609e-28): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=5.71426e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.07681e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.04988e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.04435e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.05876e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=2.85783e-26): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.15361e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.09981e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.08879e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.11757e-27): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=5.71561e-26): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.15022e-26): result may not be accurate.
MSE 18.263982994094224 MAE 2.5668238341663545
C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.09694e-26): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.08662e-26): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=4.11468e-26): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=5.71103e-25): result may not be accurate. C:\Users\yingl\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning: Ill-conditioned matrix (rcond=3.6716e-30): result may not be accurate. C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\2064807063.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2)* KNN¶
In [26]:
def perform_knn(X_train, X_test, y_train, y_test):
#create new a knn model
knn2 = KNeighborsRegressor()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X_train, y_train)
rf = KNeighborsRegressor(n_neighbors = knn_gscv.best_params_['n_neighbors'])
y_pred = knn_gscv.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
df_result['knn'] = y_pred
perform_knn(X_train, X_test, y_train, y_test)
# perform_knn(X_train_std, X_test_std, y_train, y_test)
MSE 8.708802125966157 MAE 1.2657873414670053
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\2542152999.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
w/ Limited Feature¶
In [27]:
perform_knn(X_train_limited, X_test_limited, y_train, y_test)
MSE 8.743654635390525 MAE 1.2664770220474932
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\2542152999.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
3)* Random Forest - Bagging¶
In [28]:
def perform_rf(X_train, X_test, y_train, y_test):
param_grid = {
'max_depth': [5,10,20,30],
'max_features' : [5,10,20,30],
'n_estimators': [20,50]}
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=0),
param_grid=param_grid,
cv=KFold(n_splits=5, shuffle=True, random_state=1))
grid_search.fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators = grid_search.best_params_['n_estimators'],
max_features = grid_search.best_params_['max_features'],
max_depth = grid_search.best_params_['max_depth'],
random_state = 0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('MSRE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
df_result['random_forest'] = y_pred
plt.figure(figsize=(20, 16))
global_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
global_importances.sort_values(ascending=True, inplace=True)
global_importances.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - Built-in Method")
perform_rf(X_train, X_test, y_train, y_test)
# perform_knn(X_train_std, X_test_std, y_train, y_test)
MSRE 0.4986699749532806 MAE 0.2769852090760171
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\4078246509.py:22: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
RF: Feature Importance w/ Random¶
In [30]:
rf_random = RandomForestRegressor(n_estimators=100, random_state=42)
rf_random.fit(X_train_random, y_train)
global_importances_random = pd.Series(rf_random.feature_importances_, index=X_train_random.columns)
global_importances_random.sort_values(ascending=True, inplace=True)
plt.figure(figsize=(20, 16))
global_importances_random.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - Built-in Method")
Out[30]:
Text(0.5, 1.0, 'Global Feature Importance - Built-in Method')
In [31]:
global_importances_random
Out[31]:
alignment-faults 0.000000 bpf-output 0.000000 dummy 0.000000 emulation-faults 0.000000 major-faults 0.000000 CPU MHz 0.000037 host_cpu_system 0.000088 host_cpu_idle 0.000165 RANDOM 0.000177 host_cpu_user 0.000181 Core(s) per socket 0.000307 Thread(s) per core 0.000360 NUMA node(s) 0.000386 L1-dcache-loads 0.000421 CPU family 0.000450 branch-loads 0.000520 Stepping 0.000604 instructions 0.000614 branch-instructions 0.000697 branch-load-misses 0.000793 host_memused 0.000801 context-switches 0.000995 LLC-stores 0.001073 LLC-load-misses 0.001092 cache-misses 0.001187 Socket(s) 0.001191 cpu-migrations 0.001511 BogoMIPS 0.001871 branch-misses 0.002031 Model 0.002050 cpu-clock 0.002287 CPU(s) 0.002462 task-clock 0.002567 cpu-cycles 0.003228 msr/mperf/ 0.003232 dTLB-loads 0.003769 msr/aperf/ 0.004448 iTLB-loads 0.004805 msr/tsc/ 0.005495 L1-dcache-prefetch-misses 0.005519 threads 0.006102 iTLB-load-misses 0.015322 stalled-cycles-frontend 0.016234 minor-faults 0.026734 dTLB-load-misses 0.029545 page-faults 0.034334 L1-dcache-load-misses 0.042174 L1-icache-load-misses 0.042380 LLC-loads 0.051098 cache-references 0.051360 IPC 0.086228 runtime_serial 0.153859 IPS 0.387219 dtype: float64
4) Boosting¶
4.1) Gradient Boosting¶
In [32]:
gb_reg = GradientBoostingRegressor(random_state=0)
gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
df_result['gradient_boost'] = y_pred
MSE 0.5147033430154405 MAE 0.41750904198841166
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\2014355863.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Feature Importance¶
In [33]:
# print(reg.feature_importances_)
plt.figure(figsize=(20, 16))
global_importances = pd.Series(gb_reg.feature_importances_, index=X_train.columns)
global_importances.sort_values(ascending=True, inplace=True)
global_importances.plot.barh(color='green')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Global Feature Importance - Built-in Method")
Out[33]:
Text(0.5, 1.0, 'Global Feature Importance - Built-in Method')
In [34]:
xg_regressor=xgb.XGBRegressor(eval_metric='rmsle')
param_grid = {"max_depth": [4, 5, 6],
"n_estimators": [500, 600, 700],
"learning_rate": [0.01, 0.015]}
# try out every combination of the above values
search = GridSearchCV(xg_regressor, param_grid, cv=5).fit(X_train, y_train)
print("The best hyperparameters are ",search.best_params_)
xg_regressor=xgb.XGBRegressor(learning_rate = search.best_params_["learning_rate"],
n_estimators = search.best_params_["n_estimators"],
max_depth = search.best_params_["max_depth"],
eval_metric='rmse') # mae
xg_regressor.fit(X_train, y_train)
predictions = xg_regressor.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, predictions))
print('MSE', mean_squared_error(y_test, predictions))
print('MAE', mean_absolute_error(y_test, predictions))
df_result['xgboost'] = predictions
The best hyperparameters are {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
MSE 0.27784825211538394
MAE 0.3125122440967919
C:\Users\yingl\AppData\Local\Temp\ipykernel_8720\3078268296.py:22: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Feature Importance¶
In [52]:
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(12,6))
plot_importance(xg_regressor, max_num_features=25, ax=ax)
plt.show()
Feature Importance w/ Random¶
In [54]:
xg_regressor_rn =xgb.XGBRegressor(eval_metric='rmsle')
param_grid = {"max_depth": [4, 5, 6],
"n_estimators": [500, 600, 700],
"learning_rate": [0.01, 0.015]}
# try out every combination of the above values
search = GridSearchCV(xg_regressor_rn, param_grid, cv=5).fit(X_train, y_train)
print("The best hyperparameters are ",search.best_params_)
xg_regressor_rn=xgb.XGBRegressor(learning_rate = search.best_params_["learning_rate"],
n_estimators = search.best_params_["n_estimators"],
max_depth = search.best_params_["max_depth"],
eval_metric='rmsle')
xg_regressor_rn.fit(X_train_random, y_train)
predictions = xg_regressor_rn.predict(X_test_random)
# print('RMSE', root_mean_squared_error(y_test, predictions))
print('MSE', mean_squared_error(y_test, predictions))
print('MAE', mean_absolute_error(y_test, predictions))
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(20,16))
plot_importance(xg_regressor_rn, max_num_features=100, ax=ax)
plt.show()
The best hyperparameters are {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
MSE 0.2683949135822221
MAE 0.3113954345154314
(optional) Neural Network¶
In [37]:
clf = MLPRegressor(solver='lbfgs', alpha=1e-5,
hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# print('RMSE', root_mean_squared_error(y_test, predictions))
print('MSE', mean_squared_error(y_test, predictions))
print('MAE', mean_absolute_error(y_test, predictions))
MSE 31.048872039430027 MAE 2.7435938129452238
Summary¶
In [42]:
print('-------------- all')
df_result = df_result[df_result['threads'] != 1]
print('MSE', mean_squared_error(df_result['speed_up'], df_result['lr_l2']))
print('MSE', mean_squared_error(df_result['speed_up'], df_result['knn']))
print('MSE', mean_squared_error(df_result['speed_up'], df_result['random_forest']))
print('MSE', mean_squared_error(df_result['speed_up'], df_result['gradient_boost']))
print('MSE', mean_squared_error(df_result['speed_up'], df_result['xgboost']))
for t in sorted(set(df_result['threads'])):
print('--------------', t)
df_result_sub = df_result[df_result['threads'] == t]
print('MSE', mean_squared_error(df_result_sub['speed_up'], df_result_sub['lr_l2']))
print('MSE', mean_squared_error(df_result_sub['speed_up'], df_result_sub['knn']))
print('MSE', mean_squared_error(df_result_sub['speed_up'], df_result_sub['random_forest']))
print('MSE', mean_squared_error(df_result_sub['speed_up'], df_result_sub['gradient_boost']))
print('MSE', mean_squared_error(df_result_sub['speed_up'], df_result_sub['xgboost']))
-------------- all MSE 19.51164386471562 MSE 9.702962276025316 MSE 0.5318719138188693 MSE 0.5765539779829469 MSE 0.31378322996781016 -------------- 2 MSE 7.844468417737685 MSE 0.9277877933879357 MSE 0.1162295640820079 MSE 0.08374960562439229 MSE 0.047253244501073675 -------------- 4 MSE 5.13044822596368 MSE 0.3928111029302665 MSE 0.04053220543663585 MSE 0.10403350420365323 MSE 0.07104175297837222 -------------- 8 MSE 5.0118548856727845 MSE 1.3718480125818695 MSE 0.06097326486965735 MSE 0.40562636024078885 MSE 0.22380752407473783 -------------- 16 MSE 12.13290984234894 MSE 8.508481445835995 MSE 0.17832863015594477 MSE 0.23690890272086354 MSE 0.21578501842581174 -------------- 32 MSE 33.202845566563795 MSE 16.80731261651492 MSE 0.3653749883292113 MSE 1.2924573756603923 MSE 0.5932685824111987 -------------- 64 MSE 44.08228812261476 MSE 25.16765275153036 MSE 1.8896037551303972 MSE 0.8613335925280674 MSE 0.17616675264735698 -------------- 128 MSE 31.581112907895733 MSE 16.000557836325406 MSE 1.1941419026047824 MSE 1.1656563197065848 MSE 0.9769111604062021
In [48]:
print('-------------- all')
print('MAE', mean_absolute_error(df_result['speed_up'], df_result['lr_l2']))
print('MAE', mean_absolute_error(df_result['speed_up'], df_result['knn']))
print('MAE', mean_absolute_error(df_result['speed_up'], df_result['random_forest']))
print('MAE', mean_absolute_error(df_result['speed_up'], df_result['gradient_boost']))
print('MAE', mean_absolute_error(df_result['speed_up'], df_result['xgboost']))
for t in sorted(set(df_result['threads'])):
print('--------------', t)
df_result_sub = df_result[df_result['threads'] == t]
print('MAE', mean_absolute_error(df_result_sub['speed_up'], df_result_sub['lr_l2']))
print('MAE', mean_absolute_error(df_result_sub['speed_up'], df_result_sub['knn']))
print('MAE', mean_absolute_error(df_result_sub['speed_up'], df_result_sub['random_forest']))
print('MAE', mean_absolute_error(df_result_sub['speed_up'], df_result_sub['gradient_boost']))
print('MAE', mean_absolute_error(df_result_sub['speed_up'], df_result_sub['xgboost']))
-------------- all MAE 2.667573259445177 MAE 1.3413427048619602 MAE 0.2979680829355335 MAE 0.44610576349537967 MAE 0.3347213256217838 -------------- 2 MAE 1.7536210212958077 MAE 0.5680787024250077 MAE 0.1330807042567625 MAE 0.17117848172591182 MAE 0.1597805287413298 -------------- 4 MAE 1.375513412935112 MAE 0.44840347939643677 MAE 0.1373471125032823 MAE 0.2240288382786713 MAE 0.20075060389925706 -------------- 8 MAE 1.632462117569985 MAE 0.8251876122168532 MAE 0.18051909681886266 MAE 0.45820210082630675 MAE 0.3363018571756266 -------------- 16 MAE 2.5043130917558134 MAE 1.701167304940691 MAE 0.267386658109303 MAE 0.3863968295135218 MAE 0.3539184928470085 -------------- 32 MAE 3.427114626713732 MAE 1.930302173023325 MAE 0.3869613590392467 MAE 0.7069233434614469 MAE 0.4775679533356462 -------------- 64 MAE 3.8101673748089047 MAE 2.328163180289381 MAE 0.5717556907478993 MAE 0.5300809770434264 MAE 0.304143629676937 -------------- 128 MAE 4.465079673212116 MAE 1.646448752986961 MAE 0.43533013857572733 MAE 0.6931769509618437 MAE 0.5463863462547994
In [ ]: